library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(highcharter)
library(tm.plugin.webmining)
library(plotly)
library(tidytext)
library(gridExtra)
library(igraph)
library(widyr)
# Load the raw export; the `Metadata` column holds each tweet's HTML markup.
hols <- read.csv(
  "HolidayTweets.csv",
  stringsAsFactors = FALSE,
  header = TRUE,
  quote = ""
)

# Strip the HTML markup. Newlines are replaced by a space (not deleted) so the
# last word of one line is not glued to the first word of the next.
text <- unlist(lapply(hols$Metadata, extractHTMLStrip))
text <- gsub("\n", " ", text)
data_tweets <- data.frame(text = text, stringsAsFactors = FALSE)

# Hashtags: list column of the matches plus a per-tweet count.
data_tweets$hashtags <- str_extract_all(data_tweets$text, "#\\S+")
data_tweets$num_hashtags <- lengths(data_tweets$hashtags)

# Text reduced to letters, digits and spaces.
data_tweets$clean_text <- gsub("[^[:alnum:] ]", "", data_tweets$text)

# Count whitespace-separated tokens. Counting separators and adding one
# reports 2 words for single-word or empty tweets, because gregexpr() returns
# a length-1 vector (-1) when there is no match.
data_tweets$number_of_words <- str_count(data_tweets$clean_text, "\\S+")

# Flag tweets carrying a t.co short link. The dot is escaped and the pattern is
# word-bounded so text such as "at cost" or "t.com" is not flagged.
data_tweets$picture <- ifelse(
  str_detect(data_tweets$text, "\\bt\\.co\\b"),
  "Picture/link",
  "No picture/link"
)

# Row identifier; seq_len() also works when the data set is empty.
data_tweets$X <- seq_len(nrow(data_tweets))
The data set consists of ID information and HTML metadata. Here we use the extractHTMLStrip() function provided by the tm.plugin.webmining library to strip the markup. After extraction, we use several text-processing tools to derive further information such as hashtags, the number of words, and whether pictures/links were used.
# Histogram of tweet length, one bin per word count.
ggplot(data = data_tweets, mapping = aes(x = number_of_words)) +
  geom_histogram(binwidth = 1) +
  ggtitle("Number of Words Used")

# Median tweet length for tweets with and without hashtags.
data_tweets %>%
  mutate(if_hashtag = ifelse(num_hashtags > 0, "Yes", "No")) %>%
  group_by(if_hashtag) %>%
  summarise(n = median(number_of_words)) %>%
  DT::datatable()
Based on the median word count, we estimate that tweets with hashtags use at least 5 times as many words as tweets without hashtags.
# Number of tweets with and without a picture/link, drawn on a log scale.
data_tweets %>%
  group_by(picture) %>%
  tally() %>%
  hchart("column", x = picture, y = log(n))
We see that most of the tweets did not contain pictures/links. The two bars look comparable only because the counts are plotted on a log scale.
# Median tweet length for tweets with and without a picture/link.
data_tweets %>%
  group_by(picture) %>%
  summarise(n = median(number_of_words)) %>%
  DT::datatable()
Based on the median estimate, tweets with pictures/links tend to use more words.
library(tidyr)
library(RSentiment)
# Build the text used for sentiment scoring.
# Start from the raw tweet text: `clean_text` has already lost every
# non-alphanumeric character, so the URL, retweet, hashtag and @mention
# patterns below could never match it.
tweet <- data_tweets$text
# removing URLs (only the URL itself, not the remainder of the tweet)
tweet <- gsub("(f|ht)tps?://\\S+", " ", tweet)
# retweet
tweet <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", tweet)
# removing hashtags
tweet <- gsub("#\\w+", " ", tweet)
# removing @people
tweet <- gsub("@\\w+", " ", tweet)
# removing punctuations
tweet <- gsub("[[:punct:]]", " ", tweet)
# removing numbers
tweet <- gsub("[[:digit:]]", " ", tweet)
# removing emojis and any other remaining symbols
tweet <- gsub("[^[:alnum:][:space:]]", " ", tweet)
# normalising control and whitespace characters to plain spaces
tweet <- str_replace_all(tweet, "[^[:graph:]]", " ")
# removing leftover link/entity fragments as whole words only, so words that
# merely contain them (e.g. "example", "camp") are left intact
tweet <- str_replace_all(tweet, "\\bhttps?\\b", " ")
tweet <- str_replace_all(tweet, "\\bamp\\b", " ")
# Optional extra words to strip; empty strings are ignored because an empty
# alternation would turn the pattern into a match-everything regex.
wordstoremove <- c("")
extra_words <- wordstoremove[nzchar(wordstoremove)]
if (length(extra_words) > 0) {
  tweet <- gsub(paste(extra_words, collapse = "|"), "", tweet)
}
# removing non-english characters
#tweet1 <- grep('tweet',iconv(tweet,'latin1','ASCII',sub='tweet'))
data_tweets$clean_text_2 <- tweet
# Score each cleaned tweet with RSentiment, one call per tweet.
# seq_len() yields an empty sequence for an empty data set (1:0 would iterate
# over c(1, 0)), and the column is assigned once instead of element by element.
data_tweets$sentiment <- vapply(
  seq_len(nrow(data_tweets)),
  function(i) {
    as.character(calculate_sentiment(data_tweets$clean_text_2[i])$sentiment)
  },
  FUN.VALUE = character(1)
)
# Share of tweets per sentiment class, in percent.
# Stored as `sentiment_share` rather than `pi`, which would mask the base
# constant `pi` for the rest of the session.
sentiment_share <- data_tweets %>%
  group_by(sentiment) %>%
  summarise(n = n()) %>%
  mutate(Percentage = (n / sum(n)) * 100)

plot_ly() %>%
  add_pie(
    data = sentiment_share,
    labels = sentiment_share$sentiment,
    values = sentiment_share$Percentage,
    name = ""
  ) %>%
  layout(
    title = 'Percentage Sentiment in Tweets',
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )
We see that most of the tweets were neutral in nature, followed by positive and negative ones.
To answer this question, we look into what words make up negative tweets.
# Tweets scored as negative, tokenised into single lower-case words with
# stop words removed.
words_neg <- subset(data_tweets, sentiment %in% c("Negative", "Very Negative"))
temp <- words_neg[, c("clean_text_2", "sentiment", "X")]
words <- temp %>%
  unnest_tokens(word, clean_text_2) %>%
  filter(
    !word %in% stop_words$word,
    str_detect(word, "^[a-z']+$")
  )

# Replace the tweet-level sentiment with a word-level one. Each distinct word
# is scored once and the result is mapped back onto every occurrence; this
# also handles an empty `words` table, where 1:0 would iterate over c(1, 0).
word_sentiments <- vapply(
  unique(words$word),
  function(w) as.character(calculate_sentiment(w)$sentiment),
  FUN.VALUE = character(1)
)
words$sentiment <- unname(word_sentiments[words$word])

# Negative words that occur more than 10 times, most frequent first.
words %>%
  filter(sentiment %in% c("Negative", "Very Negative")) %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  filter(n > 10) %>%
  hchart("column", x = word, y = n) %>%
  hc_xAxis(labels = list(rotation = -90, step = 1))
Words like “emergency”, “shame”, “pig”, “evil”, “bs”, and “terrible” fall under this category.
# Axis settings that hide every axis element (pie charts need no axes).
ax <- list(
  zeroline = FALSE,
  showline = FALSE,
  showticklabels = FALSE,
  showgrid = FALSE
)

# Tweet counts per picture/sentiment combination, then the sentiment share
# (in percent) within each picture group.
temp <- data_tweets %>% group_by(picture, sentiment) %>% summarise(n = n())
temp_pic <- subset(temp, picture == "Picture/link")
temp_no_pic <- subset(temp, picture == "No picture/link")
temp_pic <- temp_pic %>% mutate(percentage = (n / sum(n)) * 100)
temp_no_pic <- temp_no_pic %>% mutate(percentage = (n / sum(n)) * 100)

# Two pies side by side. The picture pie is on the left and the no-picture pie
# on the right, matching the order in the title; each trace is named after its
# group and captioned so the two pies can be told apart.
pie_chart_1 <- plot_ly() %>%
  add_pie(
    data = temp_pic,
    labels = temp_pic$sentiment,
    values = temp_pic$percentage,
    name = "Picture/link",
    domain = list(x = c(0, 0.48), y = c(0.5, 1))
  ) %>%
  add_pie(
    data = temp_no_pic,
    labels = temp_no_pic$sentiment,
    values = temp_no_pic$percentage,
    name = "No picture/link",
    domain = list(x = c(0.52, 1), y = c(0.5, 1))
  ) %>%
  layout(
    title = "Picture and No Picture",
    xaxis = ax,
    yaxis = ax,
    annotations = list(
      list(
        x = 0.24, y = 0.45, xref = "paper", yref = "paper",
        xanchor = "center", text = "Picture/link", showarrow = FALSE
      ),
      list(
        x = 0.76, y = 0.45, xref = "paper", yref = "paper",
        xanchor = "center", text = "No picture/link", showarrow = FALSE
      )
    )
  )
pie_chart_1
#pie_chart_2
library(DT)
# Tokenise every cleaned tweet into single lower-case words, dropping stop
# words and any token that is not purely letters/apostrophes.
temp <- data_tweets[, c("clean_text_2", "sentiment", "X")]
words <- unnest_tokens(temp, word, clean_text_2) %>%
  filter(!word %in% stop_words$word) %>%
  filter(str_detect(word, "^[a-z']+$"))

# Words used more than 250 times, most frequent first, on a log scale.
words %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  filter(n > 250) %>%
  arrange(desc(n)) %>%
  hchart("column", x = word, y = log(n)) %>%
  hc_xAxis(labels = list(rotation = -90, step = 1))
#DT::datatable(words %>% group_by(word) %>% summarise(n=n()) %>% filter(n>100) %>% arrange(desc(n)))